knitr::opts_chunk$set(fig.align = 'center', message = FALSE, warning = FALSE)
library(tufte)
library(readr)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(sqldf)
#Importing data
train <-read.csv("train.csv", header = TRUE)
train
Generalidades de la base de datos train
La base de datos “train” consta de 647,054 observaciones (registros) y siete variables. Cada registro representa la compra de un producto y contiene los siguentes campos:
| Variable | Description |
|---|---|
| TripTypr | Identificador categórico que representa el tipo de visita que realizó el cliente |
| VisitNumber | Número identificador de la visita a la que pertenece la compra. |
| Weekday | Día de la semana en el cuál se realizó la compra |
| Upc | Identificador del producto comprado |
| ScanCount | Número del unidades compradas de cada producto; un valor negativo significa que el producto fue devuelto |
| DepartmentDescription | Descripción del departamento de origen del producto |
| FinelineNumber | Variable categórica creada por Walmart, mediante el cuál se clasifican los productos en diversas categorías |
glimpse(train)
## Observations: 647,054
## Variables: 7
## $ TripType <int> 999, 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26…
## $ VisitNumber <int> 5, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,…
## $ Weekday <fct> Friday, Friday, Friday, Friday, Friday, Friday,…
## $ Upc <dbl> 68113152929, 60538815980, 7410811099, 223840351…
## $ ScanCount <int> -1, 1, 1, 2, 2, 2, 1, 1, 1, -1, 1, 2, 1, 1, -1,…
## $ DepartmentDescription <fct> FINANCIAL SERVICES, SHOES, PERSONAL CARE, PAINT…
## $ FinelineNumber <int> 1000, 8931, 4504, 3565, 1017, 1017, 1017, 2802,…
Dos de las siete variables de la base de datos (Upc, FinelineNumber), tienen valores "NA" en sus registros: cada uno 4129 valores faltantes.
En pasos futuros, procederemos a la imputación de los datos faltantes; no obstante lo anterior, ~4000 valores faltantes no representa un número grande, comparado con el total de registros: 647,054.
summary(train)
## TripType VisitNumber Weekday Upc
## Min. : 3.00 Min. : 5 Friday : 96247 Min. :8.340e+02
## 1st Qu.: 27.00 1st Qu.: 49268 Monday : 83130 1st Qu.:3.400e+09
## Median : 39.00 Median : 97074 Saturday :122096 Median :7.050e+09
## Mean : 58.58 Mean : 96168 Sunday :133975 Mean :3.061e+10
## 3rd Qu.: 40.00 3rd Qu.:144316 Thursday : 67962 3rd Qu.:3.007e+10
## Max. :999.00 Max. :191347 Tuesday : 72529 Max. :9.790e+11
## Wednesday: 71115 NA's :4129
## ScanCount DepartmentDescription FinelineNumber
## Min. :-12.000 GROCERY DRY GOODS : 70402 Min. : 0
## 1st Qu.: 1.000 DSD GROCERY : 68332 1st Qu.:1404
## Median : 1.000 PRODUCE : 51115 Median :3352
## Mean : 1.109 DAIRY : 43820 Mean :3727
## 3rd Qu.: 1.000 PERSONAL CARE : 41969 3rd Qu.:5501
## Max. : 71.000 IMPULSE MERCHANDISE: 28712 Max. :9998
## (Other) :342704 NA's :4129
train$TripType<-as.factor(train$TripType)
Ahora, sabemos que los registros en la base de datos representa el número de compras en Walmart, no así el número de visitas; entonces, para el saber el número de valores únicos en cada una de las columnas, aplicamos la siguientes funciones.
train<-as_tibble(train)
#to see how many distinct values each variable has
train %>% summarise_all(list(~n_distinct(.)))
El resultado es que que en realidad, sólo existen:
Revisemos cada uno de los valores únicos de las distintas variables:
unique(train$DepartmentDescription)
## [1] FINANCIAL SERVICES SHOES
## [3] PERSONAL CARE PAINT AND ACCESSORIES
## [5] DSD GROCERY MEAT - FRESH & FROZEN
## [7] DAIRY PETS AND SUPPLIES
## [9] HOUSEHOLD CHEMICALS/SUPP NULL
## [11] IMPULSE MERCHANDISE PRODUCE
## [13] CANDY, TOBACCO, COOKIES GROCERY DRY GOODS
## [15] BOYS WEAR FABRICS AND CRAFTS
## [17] JEWELRY AND SUNGLASSES MENS WEAR
## [19] ACCESSORIES HOME MANAGEMENT
## [21] FROZEN FOODS SERVICE DELI
## [23] INFANT CONSUMABLE HARDLINES PRE PACKED DELI
## [25] COOK AND DINE PHARMACY OTC
## [27] LADIESWEAR COMM BREAD
## [29] BAKERY HOUSEHOLD PAPER GOODS
## [31] CELEBRATION HARDWARE
## [33] BEAUTY AUTOMOTIVE
## [35] BOOKS AND MAGAZINES SEAFOOD
## [37] OFFICE SUPPLIES LAWN AND GARDEN
## [39] SHEER HOSIERY WIRELESS
## [41] BEDDING BATH AND SHOWER
## [43] HORTICULTURE AND ACCESS HOME DECOR
## [45] TOYS INFANT APPAREL
## [47] LADIES SOCKS PLUS AND MATERNITY
## [49] ELECTRONICS GIRLS WEAR, 4-6X AND 7-14
## [51] BRAS & SHAPEWEAR LIQUOR,WINE,BEER
## [53] SLEEPWEAR/FOUNDATIONS CAMERAS AND SUPPLIES
## [55] SPORTING GOODS PLAYERS AND ELECTRONICS
## [57] PHARMACY RX MENSWEAR
## [59] OPTICAL - FRAMES SWIMWEAR/OUTERWEAR
## [61] OTHER DEPARTMENTS MEDIA AND GAMING
## [63] FURNITURE OPTICAL - LENSES
## [65] SEASONAL LARGE HOUSEHOLD GOODS
## [67] 1-HR PHOTO CONCEPT STORES
## [69] HEALTH AND BEAUTY AIDS
## 69 Levels: 1-HR PHOTO ACCESSORIES AUTOMOTIVE BAKERY BATH AND SHOWER ... WIRELESS
unique(train$ScanCount)
## [1] -1 1 2 3 5 6 4 14 -2 9 7 10 8 -3 -5 11 16 -4 13
## [20] 15 30 -6 12 20 -12 19 46 23 -7 22 25 24 31 -9 51 17 18 71
## [39] -10
Encontramos que a pesar de que aparentemente la variable DepartmentDescription no tiene missing values, en realidad sí tiene valores faltantes registrados como NUll. Veamos cuántos registros son NULL.
filter(train, DepartmentDescription=="NULL") %>% dim()
## [1] 1361 7
Ok! La variableDepartmentDescription tiene con 1361 valores faltantes. Además, vemos que entre sus categorías existe las siguientes dos categorías que probablemente corresponden a la misma categoría: MENS WEAR y MENSWEAR.
filter(train, DepartmentDescription=="MENS WEAR") %>% dim()
## [1] 12250 7
filter(train, DepartmentDescription=="MENSWEAR") %>% dim()
## [1] 338 7
Para fácil análisis, cambiamos los datos “NULL” a “Na”s y homogeneizamos MENS WEAR y MENSWEAR en una sola categoría: MENS WEAR.
#Limpiando los datos
train <-train %>%
mutate(DepartmentDescription= replace(DepartmentDescription, DepartmentDescription == 'MENSWEAR', "MENS WEAR"))
train <-train %>%
mutate(DepartmentDescription= replace(DepartmentDescription, DepartmentDescription == "NULL", NA))
summary(train)
## TripType VisitNumber Weekday Upc
## 40 :174164 Min. : 5 Friday : 96247 Min. :8.340e+02
## 39 : 95504 1st Qu.: 49268 Monday : 83130 1st Qu.:3.400e+09
## 37 : 38954 Median : 97074 Saturday :122096 Median :7.050e+09
## 38 : 29565 Mean : 96168 Sunday :133975 Mean :3.061e+10
## 25 : 27609 3rd Qu.:144316 Thursday : 67962 3rd Qu.:3.007e+10
## 7 : 23199 Max. :191347 Tuesday : 72529 Max. :9.790e+11
## (Other):258059 Wednesday: 71115 NA's :4129
## ScanCount DepartmentDescription FinelineNumber
## Min. :-12.000 GROCERY DRY GOODS: 70402 Min. : 0
## 1st Qu.: 1.000 DSD GROCERY : 68332 1st Qu.:1404
## Median : 1.000 PRODUCE : 51115 Median :3352
## Mean : 1.109 DAIRY : 43820 Mean :3727
## 3rd Qu.: 1.000 PERSONAL CARE : 41969 3rd Qu.:5501
## Max. : 71.000 (Other) :370055 Max. :9998
## NA's : 1361 NA's :4129
filter(train, DepartmentDescription=="MENSWEAR") %>% dim()
## [1] 0 7
train %>% summarise_all(list(~n_distinct(.)))
#Histograms of DepartmentDescription by Weekday
subseted_train = 0
plot = 0
for (i in unique(train$DepartmentDescription)) {
subseted_train = subset(train, DepartmentDescription == i)
plot <- ggplot(subseted_train, aes(x = Weekday)) +
geom_histogram(stat="count") +
labs(y= "Total Count", x = "Weekday", title=paste0(i, " by Weekday"), subtitle = "Counts by Number of Purchases")+
coord_flip()
print(plot)
}
#Histograms of Triptypes by Weekday
subseted_train = 0
plot = 0
for (i in unique(train$TripType)) {
subseted_train = subset(train, TripType == i)
plot <- ggplot(subseted_train, aes(x = Weekday)) +
geom_histogram(stat="count") +
ggtitle(paste0(i, " Histogram by Weekday")) +
labs(y= "Total Count", x = "Weekday")+
coord_flip()
print(plot)
}
#Histograms of Triptypes by Weekday
subseted_train = 0
plot = 0
for (i in unique(train$DepartmentDescription)) {
subseted_train = subset(train, DepartmentDescription == i)
plot <- ggplot(subseted_train, aes(x = TripType)) +
geom_bar(stat="count") +
ggtitle(paste0(i, " Histogram by TripType")) +
labs(y= "Total Count", x = "TripType")+
coord_flip()
print(plot)
}
subseted_train = 0
plot = 0
for (i in unique(train$Weekday)) {
subseted_train = subset(train, Weekday == i)
plot <- ggplot(subseted_train, aes(x = TripType)) +
geom_bar(stat="count") +
ggtitle(paste0(i, " Histogram by TripType")) +
labs(y= "Total Count", x = "TripType")+
coord_flip()
print(plot)
}
subseted_train = 0
plot = 0
for (i in unique(train$Weekday)) {
subseted_train = subset(train, Weekday == i)
plot <- ggplot(subseted_train, aes(x = DepartmentDescription)) +
geom_bar(stat="count") +
ggtitle(paste0(i, " Histogram by DepartmentDescription")) +
labs(y= "Total Count", x = "DepartmentDescription")+
coord_flip() +
theme(axis.text = element_text(size = 8))
print(plot)
}